import codecs
import itertools as it
import os
import os
import pickle
import re
import string
import warnings

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import pandas as pd
import pandas as pd
import seaborn as sns
import nltk
import nltk.tag as tag
import nltk.tag as tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
import pyLDAvis
import pyLDAvis.gensim
import spacy
from spacy.lang.en import STOP_WORDS
from bokeh.palettes import viridis
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases
from gensim.models import word2vec
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
from sklearn.manifold import TSNE

stop_words = stopwords.words('english')
translator = str.maketrans('', '', string.punctuation)
lemmatizer = WordNetLemmatizer  # NOTE(review): this binds the class itself, not an instance
%matplotlib inline
# Download and unpack the file from https://www.kaggle.com/zynicide/wine-reviews/data
df = pd.read_csv('/data2/text_datasets/winedata.csv', index_col=0)
# Extract the vintage from the title. Use a raw string and require four
# consecutive digits so a stray number in a title (e.g. "20 Barrels") is
# not mistaken for the year; '(\d+)' returned the first number of any length.
df['year'] = df['title'].str.extract(r'(\d{4})')
df.isnull().any()
df.head(3)
# Let's see the top 15 Countries and varieties
#print(df.country.value_counts()[:15])
#print('\n')
#print(df.variety.value_counts()[:15])
#print('\n')
#print(df.taster_name.value_counts()[:15])
#print('\n')
print(df.year.value_counts()[:15])
The following set of plots pairs a bar plot with a box plot grouped by different categories. The bar plots show counts in descending order, whereas no ordering is applied to the box plots. As a result, the colors in the two plots do not correspond to the same values.
Also, when plotting the box plots, I sometimes show a smaller subset when the counts for some category values are very low.
# Top-30 countries by review count, shown as a labelled bar chart.
tmp = pd.DataFrame(df.country.value_counts()[:30])
tmp.columns = ['Count']
plt.figure(figsize=(20,7))
# plot barh chart with index as x values
ax = sns.barplot(tmp.index, tmp.Count)
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
ax.set(xlabel="Country", ylabel='Count')
# add proper Dim values as x labels
for item in ax.get_xticklabels():
    item.set_rotation(90)
# Annotate each bar with its count. Series.iteritems() was removed in
# pandas 2.0; items() is the equivalent supported API.
for i, v in enumerate(tmp["Count"].items()):
    ax.text(i ,v[1], "{:,}".format(v[1]), color='m', va ='bottom', rotation=45)
plt.tight_layout()
plt.show()
# Points distribution for the 15 most-reviewed countries only.
plt.subplots(figsize=(20,5))
ax = sns.boxplot(x="country", y="points", data=df.loc[(df['country'].isin(list(df.country.value_counts()[:15].index)))])
plt.xticks(rotation=45)
# Top-30 wineries by review count, shown as a labelled bar chart.
tmp = pd.DataFrame(df.winery.value_counts()[:30])
tmp.columns = ['Count']
plt.figure(figsize=(20,7))
# plot barh chart with index as x values
ax = sns.barplot(tmp.index, tmp.Count)
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
ax.set(xlabel="Winery", ylabel='Count')
# add proper Dim values as x labels
for item in ax.get_xticklabels():
    item.set_rotation(90)
# Annotate each bar with its count. Series.iteritems() was removed in
# pandas 2.0; items() is the equivalent supported API.
for i, v in enumerate(tmp["Count"].items()):
    ax.text(i ,v[1], "{:,}".format(v[1]), color='m', va ='bottom', rotation=45)
plt.tight_layout()
plt.show()
# Points distribution for the 25 most-reviewed wineries only.
plt.subplots(figsize=(20,5))
ax = sns.boxplot(x="winery", y="points", data=df.loc[(df['winery'].isin(list(df.winery.value_counts()[:25].index)))])
plt.xticks(rotation=60)
# Top-30 tasters by review count, shown as a labelled bar chart.
tmp = pd.DataFrame(df.taster_name.value_counts()[:30])
tmp.columns = ['Count']
plt.figure(figsize=(20,7))
# plot barh chart with index as x values
ax = sns.barplot(tmp.index, tmp.Count)
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
ax.set(xlabel="Taster Name", ylabel='Count')
# add proper Dim values as x labels
for item in ax.get_xticklabels():
    item.set_rotation(90)
# Annotate each bar with its count. Series.iteritems() was removed in
# pandas 2.0; items() is the equivalent supported API.
for i, v in enumerate(tmp["Count"].items()):
    ax.text(i ,v[1], "{:,}".format(v[1]), color='m', va ='bottom', rotation=45)
plt.tight_layout()
plt.show()
# Points distribution per taster.
plt.subplots(figsize=(20,5))
ax = sns.boxplot(x="taster_name", y="points", data=df.loc[(df['taster_name'].isin(list(df.taster_name.value_counts()[:30].index)))])
plt.xticks(rotation=45)
# Heatmap of review counts for the 100 most frequent (country, variety) pairs.
tmp = (df.groupby(['country','variety'])
         .size()
         .reset_index())
tmp.columns = ['country','variety','count']
tmp = tmp.sort_values(by=['count'], ascending=False)[:100].dropna()
result = tmp.pivot(index='country', columns='variety', values='count')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Heatmap of the 100 highest median point scores by (country, variety).
tmp = (df.groupby(['country','variety'])[['points']]
         .median()
         .reset_index())
tmp.columns = ['country','variety','points']
tmp = tmp.sort_values(by=['points'], ascending=False)[:100].dropna()
result = tmp.pivot(index='country', columns='variety', values='points')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Heatmap of the 100 highest median point scores by (taster, country).
tmp = df.groupby(['taster_name','country'])[['points']].median().reset_index()
tmp.columns = ['taster','country','points']
tmp = tmp.sort_values(by=['points'], ascending=False)[:100]
tmp = tmp.dropna()
result = tmp.pivot(index='taster', columns='country', values='points')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Heatmap of the 100 highest median point scores by (taster, variety).
tmp = df.groupby(['taster_name','variety'])[['points']].median().reset_index()
tmp.columns = ['taster','variety','points']
tmp = tmp.sort_values(by=['points'], ascending=False)[:100]
tmp = tmp.dropna()
result = tmp.pivot(index='taster', columns='variety', values='points')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# NOTE(review): this cell is an exact duplicate of the (taster, country)
# median-points heatmap two cells above; it can probably be removed.
tmp = df.groupby(['taster_name','country'])[['points']].median().reset_index()
tmp.columns = ['taster','country','points']
tmp = tmp.sort_values(by=['points'], ascending=False)[:100]
tmp = tmp.dropna()
result = tmp.pivot(index='taster', columns='country', values='points')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Heatmap of review counts for the 100 busiest (taster, country) pairs.
tmp = (df.groupby(['taster_name','country'])
         .size()
         .reset_index())
tmp.columns = ['taster','country','count']
tmp = tmp.sort_values(by=['count'], ascending=False)[:100].dropna()
result = tmp.pivot(index='taster', columns='country', values='count')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Heatmap of the 100 highest median point scores by (taster, winery).
tmp = df.groupby(['taster_name','winery'])[['points']].median().reset_index()
tmp.columns = ['taster','winery','points']
tmp = tmp.sort_values(by=['points'], ascending=False)[:100]
tmp = tmp.dropna()
result = tmp.pivot(index='taster', columns='winery', values='points')
plt.subplots(figsize=(20,5))
sns.heatmap(result, cmap="YlGnBu")
# Price vs. points with a cubic regression fit; x_estimator collapses all
# wines with the same score to their mean price.
plt.subplots(figsize=(20,5))
sns.regplot(x="points", y="price", data=df,order=3, truncate = True,x_estimator=np.mean)
# Stop-word list used when filtering review tokens.
stopArr = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
# Use a set for exact whole-word membership tests. The previous
# ' '.join(...) produced one long string, so `word in stoplist` was a
# SUBSTRING test (e.g. 'er' matched inside 'her'), silently dropping
# many valid tokens.
stoplist = set(stopArr)
def token_lemmatize(text):
    """
    Strip punctuation, lower-case, sentence- and word-tokenize a review,
    drop stopwords, and lemmatize each remaining word.

    Returns a list of sentences, each a list of lemmatized tokens.
    """
    # The module-level `lemmatizer` is the WordNetLemmatizer class, not an
    # instance, and the original call passed (POS-tag, word) in swapped
    # positions. Build a real instance here and call it correctly.
    wnl = WordNetLemmatizer()
    # Map the first letter of the Penn Treebank tag onto the WordNet POS
    # codes that lemmatize() accepts; everything else is treated as a noun.
    wn_pos = {'J': 'a', 'V': 'v', 'R': 'r'}
    # Exact-word stopword membership (the joined-string `stoplist` global
    # would match substrings).
    stop_set = set(stopArr)
    text = text.translate(translator)
    out = []
    for sent in sent_tokenize(text.lower()):
        new_sent = []
        for word, pos in tag.pos_tag(word_tokenize(sent)):
            if word not in stop_set:
                new_sent.append(wnl.lemmatize(word, wn_pos.get(pos[:1], 'n')))
        if len(new_sent) > 0:
            out.append(new_sent)
    return out
# Tokenize/lemmatize every review (slow) and checkpoint the DataFrame.
df['description_token'] = df['description'].apply(token_lemmatize)
df.to_pickle('/data2/text_datasets/winedata.p')
#Read from previously saved dataframe
#df = pd.read_pickle('/data2/text_datasets/winedata.p')
df.head(3)
# Flatten the per-review sentence lists into one corpus-wide list of
# tokenized sentences.
sents = list(it.chain.from_iterable(df['description_token']))
# saving for later use
with open('/data2/text_datasets/winedescriptiontokens.p', 'wb') as f:
    pickle.dump(sents, f)
# Train a 300-dimensional word2vec model on the tokenized sentences and
# checkpoint it; words seen fewer than 5 times are ignored.
model = word2vec.Word2Vec(sents, size=300, window=5, min_count=5)
model.save('/data2/text_models/word2vec/gensim/winedemo/model')
#Load previously saved model
#model = word2vec.Word2Vec.load('/data2/text_models/word2vec/gensim/winedemo/model') # you can continue training with the loaded model!
def get_related_terms(token, topn=10):
    """
    Look up the topn terms most similar to `token` in the trained
    word2vec model and print them as a formatted list.
    """
    neighbours = model.wv.most_similar(positive=[token], topn=topn)
    for term, score in neighbours:
        print(u'{:20} {}'.format(term, round(score, 3)))
def word_algebra(add=None, subtract=None, topn=1):
    """
    Combine the vectors associated with the words provided in add= and
    subtract=, look up the topn most similar terms to the combined
    vector, and print the result(s).

    Uses None defaults instead of mutable list defaults (which are
    shared across calls); an omitted argument means an empty word list.
    """
    answers = model.wv.most_similar(positive=add or [],
                                    negative=subtract or [],
                                    topn=topn)
    for term, similarity in answers:
        print(term)
# Vocabulary size of the trained model.
len(model.wv.vocab)
# Nearest neighbours for some fruit, flavour and region terms.
get_related_terms(u'cherry')
get_related_terms(u'apple')
get_related_terms(u'peat')
get_related_terms(u'earthy')
get_related_terms(u'napa')
get_related_terms(u'burgundy')
# Vector arithmetic: seafood + cabernet - merlot.
word_algebra(add=[u'seafood', u'cabernet'], subtract=[u'merlot'])
* If you add steak to night and subtract day from it, it becomes lamb.
* If you add fish to night and subtract day from it, it becomes chicken.
# The food analogies described above.
word_algebra(add=[u'steak', u'night'], subtract=[u'day'])
word_algebra(add=[u'fish', u'night'], subtract=[u'day'])
* If you take the acidity out of an oak-barrel Merlot, it tastes like a Verdot :)
word_algebra(add=[u'oak', u'merlot'], subtract=[u'acidic'])
"Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located in close proximity to one another in the space."
* Build a list of the terms, integer indices,and term counts from the model vocabulary
* Sort by the term counts, so the most common terms appear first
* Unzip the terms, integer indices, and counts into separate lists
* Create a DataFrame with the vectors as data,and the terms as row labels
* Take a subset of this to include only the top 5,000 terms. This subset is called tsne_input.
* Initiate a TSNE object.
* Perform the fit_transform function using the values of the dataframe tsne_input. This returns tsne_vectors, which is converted to a dataframe.
* Create a new column called word in tsne_vectors that holds the index values.
* save dataframe tsne_vectors for later use
# Build a DataFrame of word vectors ordered by term frequency, run t-SNE
# on the 5,000 most common terms, and save the 2-D embedding.
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]
# Sort by term count descending so the most common terms come first.
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
word_vectors = pd.DataFrame(model.wv.syn0[term_indices, :], index=ordered_terms)
tsne_input = word_vectors
tsne_input = tsne_input.head(5000)
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])
tsne_vectors[u'word'] = tsne_vectors.index
# BUG fix: this previously wrote to winedata.p, clobbering the DataFrame
# pickle saved earlier; save to tsne_vectors.p, the path the commented
# read-back below expects.
tsne_vectors.to_pickle('/data2/text_datasets/tsne_vectors.p')
#tsne_vectors = pd.read_pickle('/data2/text_datasets/tsne_vectors.p')
tsne_vectors.head()
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
from bokeh.resources import INLINE
# Render Bokeh output inline in the notebook using bundled resources.
output_notebook(INLINE)
#bokeh.io.output_notebook(INLINE)
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)
# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')
# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )
# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')
# configure visual elements of the plot: hide axes/grid so only the
# point cloud is visible
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None
# engage!
show(tsne_plot);
# Locate grape-variety tokens in the 2-D embedding for inspection.
tsne_vectors[tsne_vectors.word.str.contains('verdot') | tsne_vectors.word.str.contains('merlot') | tsne_vectors.word.str.contains('pinot')]
We will compare reviews for 2015 Pinot Noir wines from the ten most prolific tasters:
# Keep only 2015 Pinot Noir reviews from the 10 most prolific tasters.
pinot_df = df.loc[(df['variety'] == 'Pinot Noir') & (df['year'] == '2015')]
pinot_df = pinot_df[['taster_name','description','points','title','description_token','province']]
pinot_df = pinot_df.loc[pinot_df['taster_name'].isin(pinot_df.taster_name.value_counts()[:10].index)]
pinot_df.taster_name.value_counts()[:10].index
* Points scored is currently a discrete variable. Adding a little random noise to the points scored so that no two wines have exactly the same score. This will help with rank ordering later on.
# Break ties in the discrete point scores with small Gaussian jitter,
# rank reviews within each taster, and keep each taster's top review.
jitter = np.random.normal(0, .1, pinot_df.shape[0])
pinot_df['points_randn'] = jitter
pinot_df['points_randn'] = pinot_df['points'] + pinot_df['points_randn']
pinot_df["group_rank"] = pinot_df.groupby("taster_name")["points_randn"].rank(ascending=0, method='dense')
pinot_df_topranks = pinot_df.loc[pinot_df['group_rank'] == 1].reset_index()
Normalizing (L2 norm) the word2vec vectors first to bring them on the same scale (sum of squares = 1). This is implemented in gensim and can be called using the following:
model.init_sims(replace=True)
If replace is set, the original vectors are forgotten and only the normalized ones are kept, which saves a lot of memory. Note that you cannot continue training or inference after doing a replace. The model becomes effectively read-only: you can call most_similar, similarity etc., but not train or infer_vector.
# L2-normalize the word vectors in place; the model becomes read-only.
# NOTE(review): init_sims is deprecated in newer gensim versions — confirm
# the installed version supports it.
model.init_sims(replace=True)
word_vectors = model.wv
# Sanity check: normalized vectors should have unit squared length (~1.0).
print(sum(word_vectors['cherry']**2))
print(sum(word_vectors['verdot']**2))
# Pairwise Word Mover's Distance between the top-ranked reviews of the
# ten tasters, shown as a lower-triangle heatmap.
sim_score = []
for doc in pinot_df_topranks['description_token']:
    # NOTE(review): only the FIRST sentence of each review is compared —
    # confirm this is intentional.
    doc = doc[0]
    #print(doc)
    for other_doc in pinot_df_topranks['description_token']:
        other_doc = other_doc[0]
        sim_score.append(model.wv.wmdistance(doc, other_doc))
sim_mat = np.array(sim_score).reshape((10, 10))
# np.bool was removed in NumPy 1.24; the builtin bool dtype is equivalent.
mask = np.zeros_like(sim_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap="YlGnBu"
sns.heatmap(sim_mat, mask=mask, cmap=cmap, vmax=.3, center=0.5,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, annot_kws={"size": 7})
# Print a few of the compared reviews (by row position) for inspection.
print('1:\n')
print(pinot_df_topranks.description[1])
print('\n4:\n')
print(pinot_df_topranks.description[4])
print('\n8:\n')
print(pinot_df_topranks.description[8])
print('\n7:\n')
print(pinot_df_topranks.description[7])
* Checking reviewers that have the most diverse coverage in terms of provinces. Michael Schachner, Anna Lee C. Iijima seem to be good candidates
# Reviews per (province, taster); then pick Michael Schachner's top-ranked
# review in each of his 10 most-reviewed provinces.
print(pinot_df.groupby(['province','taster_name']).size())
pinot_df_MS = pinot_df.loc[pinot_df['taster_name'] == 'Michael Schachner']
pinot_df_MS = pinot_df_MS.loc[pinot_df_MS['province'].isin(pinot_df_MS.province.value_counts()[:10].index)]
print(pinot_df_MS.groupby(['province']).size())
pinot_df_MS["group_rank2"] = pinot_df_MS.groupby("province")["points_randn"].rank(ascending=0,method='dense')
pinot_df_MS_topranks = pinot_df_MS.loc[(pinot_df_MS['group_rank2'] == 1)]
pinot_df_MS_topranks = pinot_df_MS_topranks.reset_index()
# Pairwise WMD between Michael Schachner's top review per province
# (first sentence of each review only).
sim_score = []
for doc in pinot_df_MS_topranks['description_token']:
    doc = doc[0]
    #print(doc)
    for other_doc in pinot_df_MS_topranks['description_token']:
        other_doc = other_doc[0]
        sim_score.append(model.wv.wmdistance(doc, other_doc))
sim_mat = np.array(sim_score).reshape((10, 10))
# np.bool was removed in NumPy 1.24; the builtin bool dtype is equivalent.
mask = np.zeros_like(sim_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap="YlGnBu"
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(sim_mat, mask=mask, cmap=cmap, vmax=.3, center=0.5,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, annot_kws={"size": 7} )
# Same per-province top-review selection for Anna Lee C. Iijima.
pinot_df_AK = pinot_df.loc[pinot_df['taster_name'] == 'Anna Lee C. Iijima']
pinot_df_AK = pinot_df_AK.loc[pinot_df_AK['province'].isin(pinot_df_AK.province.value_counts()[:10].index)]
pinot_df_AK["group_rank2"] = pinot_df_AK.groupby("province")["points_randn"].rank(ascending=0,method='dense')
pinot_df_AK_topranks = pinot_df_AK.loc[(pinot_df_AK['group_rank2'] == 1)]
pinot_df_AK_topranks = pinot_df_AK_topranks.reset_index()
print(pinot_df_AK.groupby(['province']).size())
pinot_df_AK_topranks
# Pairwise WMD between Anna Lee C. Iijima's top review per province
# (first sentence of each review only).
sim_score = []
for doc in pinot_df_AK_topranks['description_token']:
    doc = doc[0]
    #print(doc)
    for other_doc in pinot_df_AK_topranks['description_token']:
        other_doc = other_doc[0]
        sim_score.append(model.wv.wmdistance(doc, other_doc))
# NOTE(review): the reshape assumes exactly 5 top-ranked reviews — verify
# against the province filter above.
sim_mat = np.array(sim_score).reshape((5, 5))
# np.bool was removed in NumPy 1.24; the builtin bool dtype is equivalent.
mask = np.zeros_like(sim_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap="YlGnBu"
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(sim_mat, mask=mask, cmap=cmap, vmax=.3, center=0.5,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, annot_kws={"size": 7} )
# Print two of the compared reviews and a short interpretation.
print('0:\n')
print(pinot_df_AK_topranks.description[0])
print('\n2:\n')
print(pinot_df_AK_topranks.description[2])
print('\n')
print('#################################')
print(' The two reviews talk about a white wine from Pinot Noir. Dryness. Meant to be consumed now.')
* We see that the distances have slightly reduced now that we have narrowed down to a single reviewer. Next we will apply another filter to keep the same variety (Pinot Noir) from a single province only and see the effect on the distances.
# Same variety restricted to one province (Casablanca Valley), all tasters.
pinot_df_MS = pinot_df.loc[pinot_df['province'] == 'Casablanca Valley']
sim_score = []
for doc in pinot_df_MS['description_token']:
    doc = doc[0]
    #print(doc)
    for other_doc in pinot_df_MS['description_token']:
        other_doc = other_doc[0]
        sim_score.append(model.wv.wmdistance(doc, other_doc))
# NOTE(review): the reshape assumes exactly 12 Casablanca Valley reviews —
# verify against the data.
sim_mat = np.array(sim_score).reshape((12, 12))
# np.bool was removed in NumPy 1.24; the builtin bool dtype is equivalent.
mask = np.zeros_like(sim_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap="YlGnBu"
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(sim_mat, mask=mask, cmap=cmap, vmax=.3, center=0.5,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, annot_kws={"size": 7} )
# NOTE(review): spacy.load('en') is the legacy shortcut; newer spaCy
# versions require the full model name 'en_core_web_sm' — confirm the
# installed version.
nlp = spacy.load('en')
# All intermediate artifacts live under ./data/intermediate.
intermediate_directory = os.path.join(os.path.join('', 'data'),'intermediate')
review_txt_filepath = os.path.join(intermediate_directory,
                                   'review_text_all.txt')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    review_count = 0
    # create & open a new file in write mode
    with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:
        # loop through all reviews in the existing file and convert to dict
        for review in df.description:
            # write the wine review as a line in the new file
            # escape newline characters in the original review text
            review_txt_file.write(review.replace('\n', '\\n') + '\n')
            review_count += 1
else:
    # otherwise just count the reviews already written to disk
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for review_count, line in enumerate(review_txt_file):
            pass
#print u'Text from {:,} restaurant reviews in the txt file.'.format(review_count + 1)
# Show one sample review (line 9) with its escaped newlines restored.
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f, 8, 9))[0]
    sample_review = sample_review.replace('\\n', '\n')
print(sample_review)
# NOTE(review): parsed_review is not defined until the next cell
# (parsed_review = nlp(sample_review)); these exported cells are out of
# execution order, so run the parsing cell first.
for num, sentence in enumerate(parsed_review.sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence)
    print('')
for num, entity in enumerate(parsed_review.ents):
    print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print('')
%%time
# Parse one sample review with spaCy and tabulate per-token attributes.
parsed_review = nlp(sample_review)
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]
pd.DataFrame({'token_text': token_text,
              'token_pos': token_pos
              })
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]
pd.DataFrame({'token_text': token_text,
              'token_lemma': token_lemma,
              'token_shape' : token_shape
              })
def punct_space(token):
    """
    Return a truthy value when the spaCy token is pure punctuation or
    whitespace, so callers can filter such tokens out.
    """
    if token.is_punct:
        return True
    return token.is_space
def line_review(filename):
    """
    Generator: stream reviews from `filename`, one per line, restoring
    the newline characters that were escaped as '\\n' when the file
    was written.
    """
    with codecs.open(filename, encoding='utf_8') as review_file:
        for line in review_file:
            yield line.replace('\\n', '\n')
def lemmatized_sentence_corpus(filename):
    """
    Generator: parse each review in `filename` with spaCy, lemmatize it,
    and yield one space-joined sentence at a time with punctuation and
    whitespace tokens removed.
    """
    parsed_stream = nlp.pipe(line_review(filename),
                             batch_size=10000, n_threads=4)
    for parsed_review in parsed_stream:
        for sent in parsed_review.sents:
            lemmas = [token.lemma_ for token in sent
                      if not punct_space(token)]
            yield u' '.join(lemmas)
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(review_txt_filepath):
            f.write(sentence + '\n')
# Stream the lemmatized sentences back from disk and print a small sample.
unigram_sentences = LineSentence(unigram_sentences_filepath)
for unigram_sentence in it.islice(unigram_sentences, 210, 240):
    print(u' '.join(unigram_sentence))
    print(u'')
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:
    # Learn a first-order phrase (bigram) model over the unigram sentences.
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')
# Print a sample of sentences with detected bigrams joined by underscores.
bigram_sentences = LineSentence(bigram_sentences_filepath)
for bigram_sentence in it.islice(bigram_sentences, 210, 240):
    print(u' '.join(bigram_sentence))
    print(u'')
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:
    # Learn a second-order phrase (trigram) model over the bigram sentences.
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
# Print a sample of sentences with detected trigrams joined by underscores.
trigram_sentences = LineSentence(trigram_sentences_filepath)
for trigram_sentence in it.islice(trigram_sentences, 210, 240):
    print(u' '.join(trigram_sentence))
    print(u'')
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in STOP_WORDS]
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')
# Show one review before and after the phrase/stopword transformation.
print( u'Original:' + u'\n')
for review in it.islice(line_review(review_txt_filepath), 11, 12):
    print(review)
    print(u'----' + u'\n')
print(u'Transformed:' + u'\n')
with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print(review)
Topic modeling is family of techniques that can be used to describe and summarize the documents in a corpus according to a set of latent "topics". For this demo, we'll be using Latent Dirichlet Allocation or LDA, a popular approach to topic modeling.
In many conventional NLP applications, documents are represented as a mixture of the individual tokens (words and phrases) they contain. In other words, a document is represented as a vector of token counts. There are two layers in this model — documents and tokens — and the size or dimensionality of the document vectors is the number of tokens in the corpus vocabulary. This approach has a number of disadvantages:
Document vectors tend to be large (one dimension for each token ⇒ lots of dimensions). They also tend to be very sparse. Any given document only contains a small fraction of all tokens in the vocabulary, so most values in the document's token vector are 0. The dimensions are fully independent from each other — there's no sense of connection between related tokens, such as knife and fork. LDA injects a third layer into this conceptual model. Documents are represented as a mixture of a pre-defined number of topics, and the topics are represented as a mixture of the individual tokens in the vocabulary. The number of topics is a model hyperparameter selected by the practitioner. LDA makes a prior assumption that the (document, topic) and (topic, token) mixtures follow Dirichlet probability distributions. This assumption encourages documents to consist mostly of a handful of topics, and topics to consist mostly of a modest set of the tokens.
# Re-declare the intermediate paths so this section can run standalone.
intermediate_directory = os.path.join(os.path.join('', 'data'),'intermediate')
trigram_dictionary_filepath = os.path.join(intermediate_directory,'trigram_dict_all.dict')
%%time
# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 1:
    trigram_reviews = LineSentence(trigram_reviews_filepath)
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
Like many NLP techniques, LDA uses a simplifying assumption known as the bag-of-words model. In the bag-of-words model, a document is represented by the counts of distinct terms that occur within it. Additional information, such as word order, is discarded.
Using the gensim Dictionary we learned to generate a bag-of-words representation for each review. The trigram_bow_generator function implements this. We'll save the resulting bag-of-words reviews as a matrix.
In the following code, "bag-of-words" is abbreviated as bow.
# The bag-of-words corpus is stored on disk in Matrix Market (.mm) format.
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')
def trigram_bow_generator(filepath):
    """
    Generator: stream the transformed reviews from `filepath` and yield
    each one as a gensim bag-of-words vector built with the module-level
    trigram_dictionary.
    """
    for tokens in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(tokens)
%%time
# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 1:
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
With the bag-of-words corpus, we're finally ready to learn our topic model from the reviews. We simply need to pass the bag-of-words matrix and Dictionary from our previous steps to LdaMulticore as inputs, along with the number of topics the model should learn. For this demo, we're asking for 10 topics.
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
%%time
# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 1:
    # Suppress gensim's training warnings while fitting the model.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=10,
                           id2word=trigram_dictionary,
                           workers=7)
        lda.save(lda_model_filepath)
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)
def explore_topic(topic_number, topn=25):
    """
    Print the topn highest-probability terms for the given LDA topic as
    a formatted 'term  frequency' table.
    """
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    topic_terms = lda.show_topic(topic_number, topn=topn)
    for term, frequency in topic_terms:
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
explore_topic(topic_number=0, topn = 20)
It's possible to go through and inspect each topic in the same way, and try to assign a human-interpretable label that captures the essence of each one. I've given it a shot for all 10 topics below.
# Hand-assigned human-readable labels for each of the 10 LDA topics.
topic_names = {0: u'cherries, tannins',
               1: u'ripe, rich, age, character',
               2: u'acidic red wine',
               3: u'cherries ,black plums, tannins',
               4: u'palate_finish_fresh_citrus',
               5: u'white wines acidic palate',
               6: u'ripe berry cherry',
               7: u'aroma_notes_palate',
               8: u'black_cherries fruits',
               9: u'dry_acidic'
               }
topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
# Persist the labels for reuse in later visualization steps.
with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)